/**********************************************************************/
/*
   Author: Karan Makkar, Michelle Han
   Created: December 2024
   Description: Merges and reshapes PMO data transfers:
		 1. JPAL_DATA_I: Batch 1 - 16
		 2. JPAL_DATA_II: Batch 1 - 16 randomization weights
		 3. JPAL_DATA_III: Batch 17 and weights
		 4. JPAL_DATA_VI: Batch 18 - 22 and weights

	 Outputs data at person level.

   Outputs:
   pmo_b1-22_raw_wide.dta

*/
/**********************************************************************/

	* Pull in the project filepath globals unless global master_run is "1"
	* (presumably set by a master do-file that has already included them)
	if "$master_run" != "1" {
		include "./Do/SET_FILEPATHS.do"
	}

	* Close any log left open, then start a date-stamped text log
	capture log close
	local prefix: display %tdCYND td(`c(current_date)')
	local logname "`prefix'_PMO_merge_wide_b1-22"
	log using "$KP_logs/`logname'", replace text

/*----------------------------------------------------*/
            /* Section: Append batches */
/*----------------------------------------------------*/

* Seed the stacked dataset with batch 1 (one row per anon_id4)
	use "$KP_deid_admin/Raw/JPAL_DATA_1/cleanedbatch_1.dta", clear
	gisid anon_id4

* Attach the batch 1 randomization weight; every record must match on both sides
	merge 1:1 anon_id4 using "$KP_deid_admin/Raw/JPAL_DATA_2/batch_1.dta", assert(3) nogenerate
	assert !missing(bobot)

	generate batch = 1

* Stack batches 2-22 underneath batch 1.
* Source layout differs by transfer:
*   - Batches 2-16:  data in JPAL_DATA_1, weight (bobot) merged from JPAL_DATA_2
*   - Batch 17:      JPAL_DATA_3, weight arrives as final_score
*   - Batches 18-22: JPAL_DATA_6, final_test_score instead of test_score and
*                    ticket_score instead of bobot; no code_domisili, instead
*                    province_code/name and city_code/name
	quietly forvalues b = 2 / 22 {
		noisily display "Appending Batch `b'"

		* Set the stacked data aside while this batch is harmonized
		preserve

		if `b' >= 18 {
			use "$KP_deid_admin/Raw/JPAL_DATA_6/finalcleanedbatch_`b'.dta", clear
			rename (final_test_score ticket_score) (test_score bobot)
		}
		else if `b' == 17 {
			use "$KP_deid_admin/Raw/JPAL_DATA_3/cleanedbatch_17.dta", clear
			rename final_score bobot
		}
		else {
			use "$KP_deid_admin/Raw/JPAL_DATA_1/cleanedbatch_`b'.dta", clear
			merge 1:1 anon_id4 using "$KP_deid_admin/Raw/JPAL_DATA_2/batch_`b'.dta", assert(3) nogenerate
		}

		* Same integrity checks as batch 1: unique person id, weight present
		gisid anon_id4
		assert !missing(bobot)

		generate batch = `b'

		* Park the harmonized batch in a temp file, bring the stack back, append
		tempfile staged
		save `staged'
		restore
		append using `staged'
	}

* A person may appear at most once per batch
	gisid anon_id4 batch
	gsort anon_id4 batch


/*----------------------------------------------------*/
	            /* Section: Clean */
/*----------------------------------------------------*/

* count number of appearances
* appearances     = number of rows (batches) per person
* last_appearance = 1 on the person's highest-batch row
	tab batch, m
	gsort anon_id4 batch
	by anon_id4: gen appearances = _N
	by anon_id4 (batch): gen last_appearance = _n == _N
	tab appearances if last_appearance, m
	gstats summ appearances if last_appearance

* check missings
* share of rows with a missing household id
	count if mi(anon_hh_id)
	di `r(N)' / _N
	*assert anon_id4 != "" // This one fails. Look into it.
	assert batch != .
	assert has_passed_current_batch != .

* check that if user gets accepted, no longer applies
* NOTE: all the problematic observations are in batches 1-3
* select_batch_not_last is defined only on rows where the person passed:
*   1 = passed but this is not their last appearance, 0 = passed on last appearance
	gsort anon_id4 batch
	by anon_id4 (batch): gen appearance_num = _n
	gen select_batch_not_last = appearance_num != appearances if has_passed_current_batch == 1
	tab select_batch_not_last

* apply_after_selected is a per-person COUNT of such rows (egen total treats
* missing as 0), so it can exceed 1 for someone who passed more than once
	gsort anon_id4 batch
	by anon_id4: egen apply_after_selected = total(select_batch_not_last)
	tab apply_after_selected
	tab batch if apply_after_selected == 1

* is this the first batch individual applies in?
	gsort anon_id4
	by anon_id4: gegen first_apply_batch = min(batch)
	assert first_apply_batch != .
	by anon_id4 : gen ord = _n
	gen first_apply = batch == first_apply_batch
	* one row per person (first_apply_batch is constant within person)
	tab first_apply_batch if ord == 1
	tab batch first_apply, row

* check number of individuals applying from same HH
* num_apply_in_batch_hh is left missing when the household id is missing
	gsort anon_hh_id batch
	by anon_hh_id batch: gen num_apply_in_batch_hh = _N if anon_hh_id != .
	gegen unique_applicants_hh = tag(anon_hh_id anon_id4)
	by anon_hh_id: gegen total_apply_hh = total(unique_applicants_hh)

* hh_applied = 1 when more than one household member applied in this batch.
* Numeric missing is larger than any number in Stata, so "> 1" on its own would
* also flag every row with a missing anon_hh_id; those rows are excluded
* explicitly and keep hh_applied = 0.
* NOTE(review): this changes hh_applied for rows with a missing anon_hh_id, so
* the expected datasignature checked before saving must be regenerated if any
* such rows exist.
	gen hh_applied = 0
	replace hh_applied = 1 if num_apply_in_batch_hh > 1 & !missing(num_apply_in_batch_hh)

* does HH win in this batch?
* hh_wins_in_batch = number of household members who passed in this batch
	gsort anon_hh_id batch
	by anon_hh_id : replace ord = _n
	by anon_hh_id batch: gegen hh_wins_in_batch = total(has_passed_current_batch) if anon_hh_id != .
	* ord == 1 here is each household's first row after sorting by batch
	tab batch hh_wins_in_batch if ord == 1, row

	* same tabulation on one row per household-batch; the data are restored afterwards
	preserve
	gsort anon_hh_id batch
	by anon_hh_id batch: replace ord = _n
	keep if ord == 1
	tab batch hh_wins_in_batch
	restore

	* binary version: any household member passed in this batch (missing if no HH id)
	gen hh_win_in_batch = hh_wins_in_batch > 0 if !missing(hh_wins_in_batch)
	tab batch hh_win_in_batch if ord == 1, row

/*----------------------------------------------------*/
	            /* Section: Reshape */
/*----------------------------------------------------*/

* drop users that apply after being selected
// NOTE: may change handling of this later
* apply_after_selected is a per-person count, not a 0/1 flag, so anyone with
* one OR MORE non-final wins is dropped (== 1 would keep people with 2+).
	drop if apply_after_selected >= 1 & !missing(apply_after_selected)
	gsort anon_id4

* Prep to reshape wide
* The trailing underscore keeps each aaa stub separate from the batch number
* that the reshape appends (e.g. aaa2_ + batch 1 -> aaa2_1, distinct from aaa21_).
	gen applied = 1
	rename aaa* aaa*_
	compress

* Reshape wide: one row per anon_id4, one copy of each variable per batch
	local varlist applied ///
	     anon_hh_id ///
	     has_passed_current_batch ///
	     first_apply_batch ///
	     bobot ///
	     year_dob ///
	     anon_month_dob ///
	     gender ///
	     education ///
	     test_score ///
	     hh_applied ///
	     hh_win_in_batch ///
	     aaa*_

	keep `varlist' anon_id4 batch

	greshape wide `varlist', ///
		i(anon_id4) j(batch) nochecks benchmark

* Clean up data
* keep only first copy of duplicated variables:
* rowfirst() takes the first non-missing value across the per-batch copies,
* after which the per-batch copies are dropped
	foreach var in first_apply_batch year_dob anon_month_dob gender ///
		test_score education aaa1_ aaa2_ aaa3_ aaa4_ aaa5_ aaa6_ aaa7_ ///
		aaa8_ aaa9_ aaa20_ aaa21_ aaa22_ aaa23_ aaa24_ aaa25_ aaa26_ aaa27_ ///
		aaa31_ aaa32_ aaa33_ aaa34_ aaa35_ aaa36_ aaa37_ aaa38_ {
		  egen `var' = rowfirst(`var'*)
		  drop `var'?*
	}

* anon_hh_id: household id from the person's first application batch.
* Created as double: a default (float) variable stores integers exactly only up
* to 16,777,216, and replace does not promote float to double, so larger ids
* would be silently rounded.
* win_batch: batch in which the person passed; if has_passed_current_batch is 1
* in several batches the highest one is kept, since later iterations overwrite.
* NOTE(review): the drop rule and the double storage type both alter the saved
* data, so the expected datasignature checked before saving must be regenerated.
	gen double anon_hh_id = .
	gen win_batch = .

	qui forval i = 1 / 22 {
		replace anon_hh_id = anon_hh_id`i' if first_apply_batch == `i'
		replace win_batch = `i' if has_passed_current_batch`i' == 1
	}

* restore original aaa names and drop the per-batch household ids
	rename aaa*_ aaa*
	drop anon_hh_id?*

* Save raw merged reshaped data
* The file is written only when the data signature of the data in memory equals
* the expected value below. On a mismatch the signature actually found is
* printed (so the expected value can be reviewed and updated), the log is
* closed, and the do-file aborts with return code 459.
	datasignature
	local found_signature "`r(datasignature)'"

	if "`found_signature'" == "23157486:144(24674):2193512926:3097620854" {
		save "$KP_deid_admin/Clean/pmo_b1-22_raw_wide.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		di as err "Data signature found: `found_signature'"
		cap log close
		error 459
	}

	cap log close


// DONE
